{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "from getpass import getpass\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from lxml.html import fromstring"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 使用selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-122-f895b145cbf0>:19: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts)\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "# Configure the Chrome instance that Selenium will drive.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the DevToolsActivePort-file-does-not-exist startup error\n",
    "opts.add_argument('window-size=1920x3000')  # requested browser window size\n",
    "opts.add_argument('--disable-gpu')  # Google's docs mention adding this switch to avoid a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "# Optional switches, disabled by default:\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false')  # do not load images, for speed\n",
    "#opts.add_argument('--headless')  # no visible window; on Linux without a display, startup fails unless this is set\n",
    "# opts.binary_location = \"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"\n",
    "# opts.binary_location = \"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "# `chrome_options=` is deprecated (this cell's saved stderr shows the DeprecationWarning);\n",
    "# `options=` is the supported keyword for passing ChromeOptions.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//a[@class=\"login__type__container__select-type\"]')\n",
    "# Do not call click() (or similar) right away; first check that the XPath matched the intended element.\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Credentials must never be stored in the notebook: read them from the environment,\n",
    "# falling back to an interactive prompt (getpass does not echo the password).\n",
    "payload = {\n",
    "    \"account\": os.environ.get(\"WECHAT_MP_ACCOUNT\") or input(\"WeChat MP account: \"),\n",
    "    \"password\": os.environ.get(\"WECHAT_MP_PASSWORD\") or getpass(\"WeChat MP password: \"),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 填写账号密码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//input[@name=\"account\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.clear()\n",
    "element.send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//input[@name=\"password\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.clear()\n",
    "element.send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 登录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//a[@class=\"btn_login\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 展开"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-F557B17E9C9651B0E44655CEEF42D91C', 'CDwindow-D248000BD6782FC8B40D78B398171222']\n"
     ]
    }
   ],
   "source": [
    "driver.find_element_by_xpath('//div[@class=\"new-creation__menu-item\"]').click()\n",
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 切换窗口"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-130-6c6d5ce6602d>:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# `switch_to_window` is deprecated (this cell's saved stderr shows the DeprecationWarning);\n",
    "# `switch_to.window` is the supported call.\n",
    "# Index 1 is the second handle in driver.window_handles (two handles were printed in the previous step).\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击超链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//li[@id=\"js_editor_insertlink\"]').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 选择其他公众号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//button[@class=\"weui-desktop-btn weui-desktop-btn_default\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]').clear()\n",
    "driver.find_element_by_xpath('//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]').send_keys(\"新媒体\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击放大镜"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__search weui-desktop-icon__small\" style=\"width: 20px; height: 20px;\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!---->     <svg viewBox=\"0 0 24 24\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><title>MP/Icon/Search</title> <g id=\"MP/Icon/Search\" stroke=\"none\" stroke-width=\"1\" fill=\"none\" fill-rule=\"evenodd\"><path d=\"M5.78025253,5.78248558 C8.51392257,3.04881554 12.9460774,3.04881554 15.6797475,5.78248558 C18.1730922,8.27583028 18.3922898,12.1821488 16.3373403,14.9239313 L20.6294949,19.2175144 L19.2152814,20.631728 L14.922508,16.3389663 C12.180685,18.394566 8.27384272,18.1755707 5.78025253,15.6819805 C3.04658249,12.9483105 3.04658249,8.51615562 5.78025253,5.78248558 Z M6.8409127,6.84314575 C4.6930291,8.99102935 4.6930291,12.4734367 6.8409127,14.6213203 C8.98879631,16.7692039 12.4712037,16.7692039 14.6190873,14.6213203 C16.7669709,12.4734367 16.7669709,8.99102935 14.6190873,6.84314575 C12.4712037,4.69526215 8.98879631,4.69526215 6.8409127,6.84314575 Z\" id=\"形状\"></path></g></svg> <!----> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/xe4FY1hGDmricMpSObN0iaUCI2a4zErjYzap0aicZloOvclwSsOKS17UjSricSTuSa40vdt3D8YmZqDBrsTUia4FSeg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">新媒体课堂</strong> <i class=\"inner_link_account_wechat\">微信号：xinmeitiketang</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/cs0VId4lkicIIIRVlrIzcq2RSaCwibiaVW12HrY6ibXwlkKwTfoGVibc6pck219E2dobpibITox1kJ5iamDfPx9dS7Libg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">辽宁北方新媒体</strong> <i class=\"inner_link_account_wechat\">微信号：beifangxinmeiti_ln</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/sdSe7EUu6dKTqFL5TyqQsVLfEvVGydEDQd0gIEgEoR6RNTXmDPMzbwruodPALicsmnxlMnWw6gQeN0o6wwicEibjg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">新媒体</strong> <i class=\"inner_link_account_wechat\">微信号：tzrb001</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/cqZFQWaUic2K2k1eP7icnkMVmujgylBOMm2DjlIibX8Xfp2ib1zibBZo07I0Uj0h5gHRYXl3m74HVDUcwZxWaBYB8vg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">新媒体</strong> <i class=\"inner_link_account_wechat\">微信号：newmedia888</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/1FuiciaNqx8ibQnkzCh8VVOmSI5aF7ibHuhX9eFZQMnAAESICaiamHcQk7aGTUtPM2DDTo4RbTYJdChYAYw9uicqiaISg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">新媒体管家</strong> <i class=\"inner_link_account_wechat\">微信号：xmtcn123</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 解析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>新媒体课堂</td>\n",
       "      <td>微信号：xinmeitiketang</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/xe4FY1hGDmricMp...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>辽宁北方新媒体</td>\n",
       "      <td>微信号：beifangxinmeiti_ln</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/cs0VId4lkicIIIR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>新媒体</td>\n",
       "      <td>微信号：tzrb001</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/sdSe7EUu6dKTqFL...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>新媒体</td>\n",
       "      <td>微信号：newmedia888</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/cqZFQWaUic2K2k1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>新媒体管家</td>\n",
       "      <td>微信号：xmtcn123</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/1FuiciaNqx8ibQn...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  nickname                  wechat  \\\n",
       "0    新媒体课堂      微信号：xinmeitiketang   \n",
       "1  辽宁北方新媒体  微信号：beifangxinmeiti_ln   \n",
       "2      新媒体             微信号：tzrb001   \n",
       "3      新媒体         微信号：newmedia888   \n",
       "4    新媒体管家            微信号：xmtcn123   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/xe4FY1hGDmricMp...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/cs0VId4lkicIIIR...  \n",
       "2  http://mmbiz.qpic.cn/mmbiz_png/sdSe7EUu6dKTqFL...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/cqZFQWaUic2K2k1...  \n",
       "4  http://mmbiz.qpic.cn/mmbiz_png/1FuiciaNqx8ibQn...  "
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Select every account entry (<li class=\"inner_link_account_item\">) from the parsed search results.\n",
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "# Build one record per entry: nickname text, WeChat-ID text and avatar image URL.\n",
    "account_list = []\n",
    "for e in 主:\n",
    "    # Each xpath() call returns a list; [0] takes the first match and raises IndexError if there is none.\n",
    "    account_nickname = e.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text\n",
    "    account_wechat = e.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text\n",
    "    account_img = e.xpath('./div/img/@src')[0]\n",
    "    account = {\"nickname\": account_nickname, \"wechat\": account_wechat, \"img\": account_img,}\n",
    "    account_list.append(account)\n",
    "\n",
    "# One DataFrame row per account; the bare last expression renders the table.\n",
    "df_account = pd.DataFrame(account_list)\n",
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/xe4FY1hGDmricMpSObN0iaUCI2a4zErjYzap0aicZloOvclwSsOKS17UjSricSTuSa40vdt3D8YmZqDBrsTUia4FSeg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">新媒体课堂</strong> <i class=\"inner_link_account_wechat\">微信号：xinmeitiketang</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "# Locate the first <li> of the account result list, print its innerHTML for inspection, then click it.\n",
    "# NOTE(review): this absolute XPath depends on the exact page layout; a class-based selector\n",
    "# would presumably be sturdier, to be confirmed against the live page.\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div/div/div/div[6]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div[2]/ul/li[1]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 跳转testing\n",
    "'''\n",
    "跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "跳转_input.clear()\n",
    "跳转_input.send_keys(2)\n",
    "跳转_a.click()\n",
    "'''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 跳转上限"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 258]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# 跳转上限\n",
    "# Collect the page-number labels of the pagination widget and convert their text to ints.\n",
    "l_e = driver.find_elements_by_xpath('//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int  = [int(x.text) for x in l_e] \n",
    "print (l_e_int)\n",
    "# True when the first and last labels carry the same number.\n",
    "print (l_e_int[0]==l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258]\n"
     ]
    }
   ],
   "source": [
    "# Every page number from 1 up to the last number shown by the pagination widget.\n",
    "# The range always starts at 1 rather than at l_e_int[0].\n",
    "pages = list(range(1, l_e_int[-1] + 1))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "pages = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "# global variables\n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        跳转_input.clear()\n",
    "        跳转_input.send_keys(p)\n",
    "        跳转_a.click()\n",
    "\n",
    "        time.sleep(1+3*random())\n",
    "\n",
    "        element = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        #print(main_content)\n",
    "        html_raw[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "2   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "3   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "4   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "5   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "6   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "7   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "8   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "9   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "10  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "11  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "13  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "14  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "15  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "16  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "17  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "18  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "19  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "20  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "21  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "22  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "23  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "24  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "25  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "26  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "27  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "28  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "29  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "30  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "31  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "32  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "33  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "34  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "35  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "36  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "37  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "38  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "39  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "40  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "41  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "42  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "43  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "44  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "45  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "46  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "47  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "48  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "49  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "50  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "51  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle \n",
    "filehandler = open(\"html_raw\", 'wb') \n",
    "pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out = df[~df.duplicated()]\n",
    "print (len(df_out))\n",
    "df[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[12]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[12]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "try_again = list(df[df.duplicated()].index)\n",
    "print(try_again)\n",
    "try_again = try_again + list (set(pages).difference(set(df.index.values)))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 暂存档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "公众号 = \"新媒体\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"原文件\"\n",
    "df.to_csv(filename.format(公众号=公众号), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "from requests_html import HTMLSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7,6,6,6,5,7,5,7,6,7,8,6,7,5,6,5,6,7,7,6,6,8,5,6,6,6,6,6,5,6,7,6,8,5,7,8,8,6,6,6,6,5,7,5,6,5,6,5,6,5,5,"
     ]
    }
   ],
   "source": [
    "from requests_html import HTMLSession\n",
    "def get_content(link):\n",
    "    session = HTMLSession()\n",
    "    r = session.get(url=link)\n",
    "    content_xpath_1 = '//*[@id=\"js_content\"]//span/text()'\n",
    "    content_xpath_2 = '//*[@id=\"js_content\"]//p/text()'\n",
    "    content_1 = ''.join(r.html.xpath(content_xpath_1))\n",
    "    content_2 = ''.join(r.html.xpath(content_xpath_2))\n",
    "    return content_1 + content_2\n",
    "\n",
    "def parse_html_snippets(_snippet_):\n",
    "    root = fromstring(_snippet_) \n",
    "    title = [x.text for x in root.xpath('//div[@class=\"inner_link_article_title\"]//span[2]')]\n",
    "    create_time = [x.text for x in root.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    link = [x for x in root.xpath('//a/@href')]\n",
    "    content_text = [get_content(x) for x in link]\n",
    "    _df_ = pd.DataFrame({\"title\":title, \"create_time\": create_time, \"link\":link, \"content_text\":content_text})\n",
    "    return(_df_)\n",
    "    \n",
    "l_df = []\n",
    "for p in pages:\n",
    "    _df_ = parse_html_snippets(df.loc[p,\"html_snippets\"])\n",
    "    print (len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>郑爽被封杀3个月了，是时候让这个热搜真相大白了！！！</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>郑爽被封杀后，与她相关联的热搜有两个。一是郑爽77天收入1.6亿，折合日薪208万。明星的高...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>分享——学习视频剪辑必备的12个网站！</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>不知道大家有没有发现一个趋势，做视频快成运营人均必备技能了。前两年躲得过抖音、快手，今年怕是...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>在听歌这件事上，不要跟爸妈比潮</td>\n",
       "      <td>2021-05-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>授权转自城市画报官方微信CITYZINE（微信号：cityzine）本文作者：倪仕轩今天文章...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>儿童节发圈必备文案！！</td>\n",
       "      <td>2021-05-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>明天过节吗？过节发圈吗？知道发什么吗？？？？看完就知道了！总有一句适合你！六一儿童节发圈必备...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>年轻人互联网玩梗图鉴0.000001</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>你怀念18年的夏天吗 那年我上大学的时候可真是起飞啊 ，我怀念的是18年的 那你告儿诉她得...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>来来来，让我看看昨天的你是什么损（sun）色（sai）？</td>\n",
       "      <td>2021-05-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>昨天的社交软件是被网易云承包的一天！无论你是刷刷朋友圈还是逛逛微博划划水目光所到之处总能看到...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>负债30亿，关店2000家！中国男装之王，如今比美特斯邦威更惨？</td>\n",
       "      <td>2021-05-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>来源：金错刀（ID：ijincuodao)；作者： 云摇曾经的男装之王七匹狼，最近过得有点惨...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>当代奶茶现状：料越来越多，水越来越少！</td>\n",
       "      <td>2021-03-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>当代年轻人生存三要素：阳光、空气、水（奶茶）奶茶，已然成了很多现代年轻人的“续命神器”，乏了...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>瓜都吃完了，你倒是说说3000万到底是谁？</td>\n",
       "      <td>2021-03-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>竹篮打水一场空，3000w的瓜无影无踪昨天说好的3000万粉丝顶流恋爱瓜周二见呢？瓜倒是没看...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>职场“潜规则”：领导想提拔你，从不看努力！</td>\n",
       "      <td>2021-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>做运营有多苦？一个人又当文案，又当设计，还要出方案、做推广，当爹又当妈。只要搞活动，老板开口...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>分享｜PDF转Word的3种转换方式——无限次且免费</td>\n",
       "      <td>2021-03-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...</td>\n",
       "      <td>ok也是没想到这么快就迎来了我们「分享」板块的第二期本期分享如题↑↑↑柚子为大家带来无限次且...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               title create_time  \\\n",
       "0         郑爽被封杀3个月了，是时候让这个热搜真相大白了！！！  2021-06-01   \n",
       "1                分享——学习视频剪辑必备的12个网站！  2021-06-01   \n",
       "2                    在听歌这件事上，不要跟爸妈比潮  2021-05-31   \n",
       "3                        儿童节发圈必备文案！！  2021-05-31   \n",
       "4                 年轻人互联网玩梗图鉴0.000001  2021-05-28   \n",
       "5       来来来，让我看看昨天的你是什么损（sun）色（sai）？  2021-05-27   \n",
       "6   负债30亿，关店2000家！中国男装之王，如今比美特斯邦威更惨？  2021-05-26   \n",
       "7                当代奶茶现状：料越来越多，水越来越少！  2021-03-18   \n",
       "8              瓜都吃完了，你倒是说说3000万到底是谁？  2021-03-17   \n",
       "9              职场“潜规则”：领导想提拔你，从不看努力！  2021-03-16   \n",
       "10        分享｜PDF转Word的3种转换方式——无限次且免费  2021-03-16   \n",
       "\n",
       "                                                 link  \\\n",
       "0   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "1   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "2   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "3   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "4   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "5   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "6   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "7   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "8   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "9   http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "10  http://mp.weixin.qq.com/s?__biz=MzAxNDYwMjQ1MA...   \n",
       "\n",
       "                                         content_text  \n",
       "0   郑爽被封杀后，与她相关联的热搜有两个。一是郑爽77天收入1.6亿，折合日薪208万。明星的高...  \n",
       "1   不知道大家有没有发现一个趋势，做视频快成运营人均必备技能了。前两年躲得过抖音、快手，今年怕是...  \n",
       "2   授权转自城市画报官方微信CITYZINE（微信号：cityzine）本文作者：倪仕轩今天文章...  \n",
       "3   明天过节吗？过节发圈吗？知道发什么吗？？？？看完就知道了！总有一句适合你！六一儿童节发圈必备...  \n",
       "4    你怀念18年的夏天吗 那年我上大学的时候可真是起飞啊 ，我怀念的是18年的 那你告儿诉她得...  \n",
       "5   昨天的社交软件是被网易云承包的一天！无论你是刷刷朋友圈还是逛逛微博划划水目光所到之处总能看到...  \n",
       "6   来源：金错刀（ID：ijincuodao)；作者： 云摇曾经的男装之王七匹狼，最近过得有点惨...  \n",
       "7   当代年轻人生存三要素：阳光、空气、水（奶茶）奶茶，已然成了很多现代年轻人的“续命神器”，乏了...  \n",
       "8   竹篮打水一场空，3000w的瓜无影无踪昨天说好的3000万粉丝顶流恋爱瓜周二见呢？瓜倒是没看...  \n",
       "9   做运营有多苦？一个人又当文案，又当设计，还要出方案、做推广，当爹又当妈。只要搞活动，老板开口...  \n",
       "10  ok也是没想到这么快就迎来了我们「分享」板块的第二期本期分享如题↑↑↑柚子为大家带来无限次且...  "
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out.loc[0:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pd.ExcelWriter('data_out/新媒体.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_url_out.to_excel(writer, sheet_name='公众号')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
